####################################
#Preparing the expression table
####################################
## Set working directory for first rna_batch of data and import rsem data ##
#dir = "/sc/hydra/projects/pd-omics/mynd/data/PBG-2097"
#samples = read.table("/sc/hydra/projects/pd-omics/mynd/qc/samples.rna_batch1", header = F, check.names = F)
#sample = samples$V1
#files = file.path(dir, sample, "Processed/RAPiD/rsem", paste0(sample, ".genes.results"))
#names(files) = sample
#txi.rsem = tximport(files, type = "rsem")
#counts1 = data.frame(txi.rsem$counts, check.names = F)
#tpms1 = data.frame(txi.rsem$abundance, check.names = F)
## Repeat for the second rna_batch of data ##
#dir = "/sc/hydra/projects/pd-omics/mynd/data/PBG-2022"
#samples = read.table("/sc/hydra/projects/pd-omics/mynd/qc/samples.rna_batch2", header = F, check.names = F)
#sample = samples$V1
#files = file.path(dir, sample, "Processed/RAPiD/rsem", paste0(sample, ".genes.results"))
#names(files) = sample
#txi.rsem1 = tximport(files, type = "rsem")
#counts2 = data.frame(txi.rsem1$counts, check.names = F)
#tpms2 = data.frame(txi.rsem1$abundance, check.names = F)
## We need to change sample names because of overlap between rna_batches. We will append the rna_batch name to the sample ID in the count matrix and then merge the two datasets ##
#colnames(counts1) <- paste(colnames(counts1), "1", sep = "_")
#colnames(tpms1) <- paste(colnames(tpms1), "1", sep = "_")
#colnames(counts2) <- paste(colnames(counts2), "2", sep = "_")
#colnames(tpms2) <- paste(colnames(tpms2), "2", sep = "_")
#dim(counts1)
#dim(counts2)
#dim(tpms1)
#dim(tpms2)
#tot_counts = merge(counts1, counts2, by = 0)
#rownames(tot_counts) = tot_counts$Row.names
#tot_counts$Row.names = NULL
#tot_tpms = merge(tpms1, tpms2, by = 0)
#rownames(tot_tpms) = tot_tpms$Row.names
#tot_tpms$Row.names = NULL
#dim(tot_tpms)
#dim(tot_counts)
#counts = tot_counts
counts = read.table('/sc/hydra/projects/pd-omics/mynd/qc/v1/mynd.filtered.counts.txt', header = T, check.names = F)
#tpms = tot_tpms
tpms = read.table('/sc/hydra/projects/pd-omics/mynd/qc/v1/mynd.filtered.tpms.txt', header = T, check.names = F)
## We need to then do the same for metadata, but rna_batch 1 and 2 are rnaseq rna_batch 1, and rna_batch 3 here, is rna_batch2 ... ##
mynd_metadata = read.table('/sc/hydra/projects/pd-omics/mynd/qc/v1/filtered_metadata.txt', row.names = 1, header = T, check.names = F)
#rownames(mynd_metadata) = paste(mynd_metadata$Sample, mynd_metadata$rna_batch, sep = "_")
#rownames(mynd_metadata) = gsub("_2", "_1", rownames(mynd_metadata))
#rownames(mynd_metadata) = gsub("_3", "_2", rownames(mynd_metadata))
#rownames(mynd_metadata) %in% colnames(tpms)
mynd_metadata = mynd_metadata[colnames(counts),]
mynd_metadata$rna_batch = as.factor(mynd_metadata$rna_batch)
mynd_metadata$library_batch = as.factor(mynd_metadata$library_batch)par(mfrow=c(2,2))
p = ggplot(mynd_metadata, aes(rRNA.rate, PCT_CODING_BASES))
p + geom_point() + geom_hline(yintercept = 20, color = 'blue') + geom_vline(xintercept = .4, color = 'red')
barplot(mynd_metadata$PCT_CODING_BASES, names = rownames(mynd_metadata), las=2, cex.axis = 0.8)
title("PCT CODING")
barplot(mynd_metadata$PF_READS, names=rownames(mynd_metadata), las=2, cex.axis = 0.8)
title("PASSED READS")
barplot(mynd_metadata$PERCENT_DUPLICATION, names=rownames(mynd_metadata), las=2, cex.axis = 0.8)
title("PERCENT DUPLICATION")# You don't need to run everythin again, we can load the RData from here!
#load(paste0(work_dir, "mynd.expression.RData"))
### Removing genes with low expression
### >= 30%
x = data.frame(counts, check.names = F)
cpm = cpm(x)
keep.exp = rowSums(cpm > 1) >= 0.3*ncol(x)
#These are the final tables of expression!
mynd_cpm = cpm[keep.exp, ]
mynd_abundance = tpms[keep.exp,]
mynd_counts = counts[keep.exp,]
# Normalization
norm = calcNormFactors(mynd_counts, method = "TMM") #normalized with TMM
#Creat Limma object
x <- DGEList(counts=mynd_counts, samples=mynd_metadata, norm.factors = norm)
v = voom(x)##Exploratory Analysis ### Library sizes
[1] 244
#We can also plot the library sizes as a barplot to see whether there are any major discrepancies between the samples more easily.
#png(paste0(work_plots, "library_sizes.#png"), width = 16, height = 10, res = 300, units = "in")
barplot(x$samples$lib.size, names=colnames(x), las=2, cex.axis = 0.8)
title("Barplot of library sizes")
#dev.off()# To do the MDS plot is necessary to check the order for the colors.
# Because of this we are using the ordered_metadata.
#plotMDS(x)
#png(paste0(work_plots, "MDS.#png"), width = 8, height = 6, res = 300, units = "in")
col.group <- mynd_metadata$rna_batch
levels(col.group) <- brewer.pal(nlevels(col.group), "Set1")
col.group <- as.character(col.group)
plotMDS(mynd_voom, col= col.group)
title("MDS by rna_batch")
getResiduals <- function(exprLine) {
ourCovRegression <- lm(exprLine ~ mynd_metadata$rna_batch + mynd_metadata$gender + mynd_metadata$age);
ourResiduals <- ourCovRegression$residuals;
return(ourResiduals)
}
allResiduals <- data.frame(t(apply(mynd_voom, 1, getResiduals)), check.names = F)
resid = as.matrix(allResiduals)
#png(paste0(work_plots, "MDSreg.png"), width = 8, height = 6, res = 300, units = "in")
plotMDS(resid, col= col.group)
#title("MDS by Status")
#dev.off()par(mfrow=c(1,2))
#PCA
mynd_voom_t = t(mynd_voom)
autoplot(prcomp(mynd_voom_t), data = mynd_metadata, colour = "rna_batch")
getResiduals <- function(exprLine) {
ourCovRegression <- lm(exprLine ~ mynd_metadata$library_batch + mynd_metadata$rin_value + mynd_metadata$rna_batch + mynd_metadata$gender + mynd_metadata$age);
ourResiduals <- ourCovRegression$residuals;
return(ourResiduals)
}
allResiduals <- data.frame(t(apply(mynd_voom, 1, getResiduals)), check.names = F)
resid = as.matrix(allResiduals)
cov = t(resid)
autoplot(prcomp(cov), data = mynd_metadata, colour = "rna_batch", label = T)
#dev.off()#Only 2 samples
#png(paste0(work_plots, "Scatter_all.#png"), width = 8, height = 6, res = 300, units = "in")
par( mfrow = c( 2,2 ))
plot(log2(mynd_counts[,1:2] + 1), pch=16, cex=0.3, main = "COUNTS")
plot(log2(mynd_cpm[,1:2] + 1), pch=16, cex=0.3, main = "CPM")
plot(log2(mynd_abundance[,1:2] + 1), pch=16, cex=0.3, main = "TPM")
plot(mynd_voom[,1:2], pch=16, cex=0.3, main = "VOOM")
#dev.off()#Density plot with all samples
# cpm table = ppmi_cpm_case_control
# log cpm table = lcpm
lcounts = log2(mynd_counts)
lcpm = log2(mynd_cpm)
ltpm = log2(mynd_abundance)
# voom use log already
nsamples <- ncol(mynd_cpm)
#samplenames <- colnames(mynd_cpm)
colfunc <- colorRampPalette(c("#4DBBD5FF", "#3C5488FF"))
col = alpha(colfunc(nsamples), alpha = 0.1)
#png(paste0(work_plots, "Density_all.#png"), width = 8, height = 6, res = 300, units = "in")
par(mfrow=c(2,2))
#COUNTS
plot(density(lcounts[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="A. COUNTS ", xlab="Log2(counts)")
#abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(lcounts[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("right", samplenames, text.col=col, bty="n")
#CPM
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="B. CPM ", xlab="Log2(CPM)")
#abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(lcpm[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("right", samplenames, text.col=col, bty="n")
#TPM
plot(density(ltpm[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="C. TPM ", xlab="Log2(TPM)")
#abline(v=ltpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(ltpm[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("right", samplenames, text.col=col, bty="n")
#Voom
plot(density(mynd_voom[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="D. VOOM ", xlab="voom")
#abline(v=lvoom.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(mynd_voom[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("right", samplenames, text.col=col, bty="n")
#dev.off()###Boxplot with log + 1 Only with few samples ###
#png(paste0(work_plots, "Boxplot_all.#png"), width = 8, height = 6, res = 300, units = "in")
#Counts
boxplot(log2(mynd_counts +1), col=rainbow(20), main="Counts", ylab="Log2(Counts+1)",las=2,cex.axis=0.8)
#CPM
boxplot(log2(mynd_cpm +1), col=rainbow(20), main="CPM", ylab="Log2(CPM+1)",las=2,cex.axis=0.8)
#TPM
boxplot(log2(mynd_abundance +1), col=rainbow(20), main="TPM", ylab="Log2(TPM+1)",las=2,cex.axis=0.8)
#Voom
boxplot(mynd_voom, col=rainbow(20), main="Voom", ylab="Voom",las=2,cex.axis=0.8)
#dev.off()###Heatmaps NOT GOOD FOR 600 samples ### 8 min CHECK THE BAR COLORS
par(mfrow=c(1,1))
#Spearman
col.cell <- c('grey','black')[mynd_metadata$status]
cormatrix = rcorr(as.matrix(mynd_voom), type='spearman')
corrdata = as.matrix(cormatrix$r)
#png(paste0(work_plots, "HM_Spearman.#png"), width = 20, height = 14, res = 300, units = "in")
heatmap.2(corrdata, main="Voom Spearman",ColSideColors = col.cell, trace="none", col = c(sort(brewer.pal(9,"Blues")),brewer.pal(9,"Reds")), margins = c(1,1))
#dev.off()#Pearson
cormatrix = rcorr(as.matrix(mynd_voom), type='pearson')
corrdata = as.matrix(cormatrix$r)
#png(paste0(work_plots, "HM_Pearson.#png"), width = 20, height = 14, res = 300, units = "in")
heatmap.2(corrdata, main="Voom Pearson", ColSideColors = col.cell,trace="none", col = c(sort(brewer.pal(9,"Blues")),brewer.pal(9,"Reds")), margins = c(1,1))
#dev.off()#Euclidean distance with the transposed matrix
sampleTree = hclust(dist(mynd_voom_t), method = "average")
#png(paste0(work_plots, "tree_euclidean.#png"), width = 20, height = 10, res = 300, units = "in")
plot(sampleTree, main = "Sample clustering to detecting outliers",
sub = "Function: hclust, Method: average from Euclidean distance", xlab = "samples",
cex.lab = 1.5, cex.axis = 1.5, cex.main = 1.5, cex.sub = 1.5)
#dev.off()#Spearman correlation #10 min
#png(paste0(work_plots, "tree_Spearman.#png"), width = 20, height = 10, res = 300, units = "in")
#plot(hcluster(mynd_voom_t, method= "spearman", link="average" ),
# cex.lab = 1.5, cex.axis = 1.5, cex.main = 2, main= "Hierarchical Clustering",
# sub = "Function: hcluster, Method: Spearman", xlab = "samples")
#dev.off()total = merge(mynd_voom_t, mynd_metadata, by = 0)
total$mismatch = ""
labelIndicesWeWant <- total$gender == "M" & total$ENSG00000229807.11 > 5 | total$gender == "F" & total$ENSG00000183878.15 > .2
total$mismatch[labelIndicesWeWant] <- as.character(total$Row.names[labelIndicesWeWant])
p = ggplot(total, aes(x = ENSG00000229807.11, y = ENSG00000183878.15, color = gender, label = mismatch))
p + geom_point() + geom_text_repel() + xlab("XIST") + ylab("UTY")#png( "cell_marker_genes.#png", width = 22, height = 11, res = 300, units = "in")
my_palette <- colorRampPalette(c('Light Blue', "White", "Red"))
library(gplots)
markers = read.table('/sc/hydra/projects/pd-omics/mynd/qc/marker.genes', header = F)
total = merge(markers, mynd_abundance, by.y = 0, by.x = 'V1', sort = F)
rownames(total) = total$V2
total$V1 = NULL
total$V2 = NULL
total$Row.names = NULL
total = as.matrix(log2(total + 1))
heatmap.2(total, dendrogram="both", Rowv =T, Colv=F, cexCol = .4, trace="none", scale = "none", col=my_palette, adjRow = c(0, 0.5), margins = c(6, 8), key = TRUE, keysize = 1.3, xlab = "Samples", ylab = "Genes", main = "Expression")
#dev.off()